pacman::p_load(tidyverse, dplyr, lubridate, timetk, ggplot2, modeltime, tidymodels)Take-home Exercise 4 (WIP)
Prototyping Modules for Visual Analytics Shiny Application
1 Overview
2 Loading R packages
3 Data preparation
weather <- read_rds("data/weather_imputed.rds")glimpse(weather)Rows: 7,665
Columns: 10
$ Station <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", …
$ Date <date> 2021-01-01, 2021-01-02, 2021-01-03, 2021-…
$ `Daily Rainfall Total (mm)` <dbl> 94.4, 114.4, 5.2, 0.0, 0.0, 0.0, 1.6, 12.6…
$ `Mean Temperature (°C)` <dbl> 24.0, 23.0, 23.9, 25.1, 26.9, 26.9, 24.4, …
$ `Maximum Temperature (°C)` <dbl> 26.2, 24.5, 25.3, 27.9, 31.6, 30.3, 26.0, …
$ `Minimum Temperature (°C)` <dbl> 21.5, 21.6, 23.2, 23.1, 24.1, 25.1, 23.8, …
$ `Mean Wind Speed (km/h)` <dbl> 6.2, 5.4, 3.6, 4.1, 5.7, 5.8, 4.9, 6.0, 6.…
$ `Max Wind Speed (km/h)` <dbl> 34.3, 30.2, 12.2, 16.9, 28.2, 25.6, 18.0, …
$ LAT <dbl> 1.3764, 1.3764, 1.3764, 1.3764, 1.3764, 1.…
$ LONG <dbl> 103.8492, 103.8492, 103.8492, 103.8492, 10…
| Variable | Description |
|---|---|
| Station | Consist of 7 station across Singapore , namely: Ang Mo Kio, Changi, Choa Chu Kang (South), East Coast Parkway, Jurong (West), Pulau Ubin, Seletar |
| Date | 1 January 2021 to 31 December 2023 (3 years) |
| Daily Rainfall Total (mm) | crucial weather metric, especially when analyzing weather patterns in a region like Singapore, known for its tropical climate with significant rainfall throughout the year |
| Mean Temperature (°C) | average temperature for a day and is calculated by averaging the temperatures measured at various points throughout the day |
| Maximum Temperature (°C) | lowest temperature recorded within a 24-hour period |
| Minimum Temperature (°C) | lowest temperature recorded within a 24-hour period |
unique_stations <- unique(weather$Station)
print(unique_stations)[1] "Ang Mo Kio" "Changi" "Choa Chu Kang (South)"
[4] "East Coast Parkway" "Jurong (West)" "Pulau Ubin"
[7] "Seletar"
The data table shows daily data for above 7 stations, the following function from timetk packages will be utilised for further time series data wrangling:
summarise_by_time()- to aggregate by a period
filter_by_time()- to filter a continuous time range
4 Prototype
4.1 Time Series Analysis
4.1.1 Simple time series plot
# Facet plot of different station
weather %>%
group_by(Station) %>% # facet
filter_by_time(Date, "2021-01", "2023-12") %>% #specify period
plot_time_series(Date,
`Mean Temperature (°C)`,
.facet_ncol = 3,
.smooth= FALSE) weather %>%
filter(Station == "Ang Mo Kio") %>% # specify station
filter_by_time(Date, "2021-01", "2023-12") %>% # specify period # for >1yr not stationary
plot_time_series(Date,
`Mean Temperature (°C)`,
.smooth = TRUE,
.plotly_slider = TRUE)The following plot shows the time series in months.
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2021-01", "2023-12") %>%
summarise_by_time(Date,
.by = "month", #aggregated by month
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_time_series(Date, `Mean Temperature (°C)`, .smooth = TRUE,
.plotly_slider = TRUE)The following plot highlight the different months using different colour. (note: it can only be applied for a period less than 1 year)
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>%
plot_time_series(Date, `Mean Temperature (°C)`,
.smooth = TRUE,
.smooth_size = 0.8,
.smooth_alpha = 0.8,
.color_var = month(Date, label= TRUE),
.plotly_slider = TRUE,
.title = "Mean temperature for Ang Mo Kio",
.y_lab = "(°C)",
.color_lab = "Month") 4.1.2 ACF Diagnostics
ACF and PACF plot of daily data for a specific station
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2021-01", "2023-12") %>%
summarise_by_time(Date,
.by = "day", # day, month
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_acf_diagnostics(Date, `Mean Temperature (°C)`,
.lags = "3 years") # period ACF and PACF plot of monthly aggregated data for a specific station
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2021-01", "2023-12") %>%
summarise_by_time(Date,
.by = "month", # day, month
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_acf_diagnostics(Date, `Mean Temperature (°C)`,
.lags = "3 years") # period Facet ACF and PACF plot of monthly aggregated data
weather %>%
group_by(Station) %>% # facet
filter_by_time(Date, "2021-01", "2023-12") %>%
summarise_by_time(Date,
.by = "month",
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_acf_diagnostics(Date, `Mean Temperature (°C)`,
.facet_ncol = 1,
.lags = "3 years")4.1.3 Seasonal and Trend decomposition using Loess
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>% #using 1 year
plot_stl_diagnostics(Date, `Mean Temperature (°C)`,
# Set features to return, desired frequency and trend
.feature_set = c("observed", "season", "trend", "remainder"),
.frequency = "auto",
.trend = "auto",
.interactive = FALSE)
4.1.4 Seasonality Diagnostics
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>% #using 1 year
plot_seasonal_diagnostics(Date, `Mean Temperature (°C)`,
.feature_set = c("week", "month.lbl", "quarter"),
.geom = "boxplot",
.interactive = TRUE)weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>% #using 1 year
plot_seasonal_diagnostics(Date, `Daily Rainfall Total (mm)`,
.feature_set = c("week", "month.lbl", "quarter"),
.geom = "boxplot",
.interactive = TRUE)4.1.5 Anomaly Detection
weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>% #using 1 year
anomalize(.date_var = Date,
.value = `Mean Temperature (°C)`,
.iqr_alpha = 0.10, #Controls the width of the "normal" range
# .max_anomalies = 0.10,
# .message = FALSE
) %>%
plot_anomalies(Date,
.ribbon_alpha = 0.15,
.interactive = TRUE)4.2 Time series forecasting
In this section, we will perform forecasting on the weather data set using modeltime.
Following steps will be used to build forecasting mode:
- Split data into training and test sets
- Create & Fit Multiple Models
- Add fitted models to a Model Table
- Calibrate the models to a testing set.
- Perform Testing Set Forecast & Accuracy Evaluation
- Refit the models to Full Dataset & Forecast Forward
Step 1 to Step 6 below is done using data set from 1 January 2021 to 31 December 2023, as using data set from 1 January 2023 to 31 December 2023.
4.2.1 Step 1 - Split data into training and test sets
weather_AMK <- weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2021-01", "2023-12")
weather_AMK %>%
plot_time_series(Date, `Mean Temperature (°C)`)weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2021-01", "2023-12") %>%
summarise_by_time(Date,
.by = "day", # day, month
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_acf_diagnostics(Date, `Mean Temperature (°C)`,
.lags = "3 years") # period # Split Data 80/20
splits <- initial_time_split(weather_AMK, prop = 0.8)splits<Training/Testing/Total>
<876/219/1095>
4.2.2 Step 2 - Create & Fit Multiple Models
Model 1: Auto ARIMA
model_fit_arima_no_boost <- arima_reg() %>%
set_engine(engine = "auto_arima") %>%
fit(`Mean Temperature (°C)` ~ Date, data = training(splits))Model 2: Boosted Auto ARIMA
model_fit_arima_boosted <- arima_boost(
min_n = 2,
learn_rate = 0.015
) %>%
set_engine(engine = "auto_arima_xgboost") %>%
fit(`Mean Temperature (°C)` ~ Date +
as.numeric(Date) + factor(month(Date, label = TRUE), ordered = F),
data = training(splits))Model 3: Exponential Smoothing
model_fit_ets <- exp_smoothing() %>%
set_engine(engine = "ets") %>%
fit(`Mean Temperature (°C)` ~ Date , data = training(splits))Model 4: Linear Regression
model_fit_lm <- linear_reg() %>%
set_engine("lm") %>%
fit(`Mean Temperature (°C)` ~ as.numeric(Date)
+ factor(month(Date, label = TRUE), ordered = FALSE),
data = training(splits))4.2.3 Step 3 - Add fitted models to a Model Table
models_tbl <- modeltime_table(
model_fit_arima_no_boost,
model_fit_arima_boosted,
model_fit_ets,
model_fit_lm
)
models_tbl# Modeltime Table
# A tibble: 4 × 3
.model_id .model .model_desc
<int> <list> <chr>
1 1 <fit[+]> ARIMA(2,1,2)
2 2 <fit[+]> ARIMA(1,1,2) W/ XGBOOST ERRORS
3 3 <fit[+]> ETS(A,N,N)
4 4 <fit[+]> LM
4.2.4 Step 4 - Calibrate the model to a testing set
calibration_tbl <- models_tbl %>%
modeltime_calibrate(new_data = testing(splits))
calibration_tbl# Modeltime Table
# A tibble: 4 × 5
.model_id .model .model_desc .type .calibration_data
<int> <list> <chr> <chr> <list>
1 1 <fit[+]> ARIMA(2,1,2) Test <tibble [219 × 4]>
2 2 <fit[+]> ARIMA(1,1,2) W/ XGBOOST ERRORS Test <tibble [219 × 4]>
3 3 <fit[+]> ETS(A,N,N) Test <tibble [219 × 4]>
4 4 <fit[+]> LM Test <tibble [219 × 4]>
4.2.5 Step 5 - Testing Set Forecast & Accuracy Evaluation
There are 2 critical parts to an evaluation:
- Visualizing the Forecast vs Test Data Set
- Evaluating the Test (Out of Sample) Accuracy
5A - Visualizing the Forecast Test Visualizing the Test Error is easy to do using the interactive plotly visualization (just toggle the visibility of the models using the Legend).
calibration_tbl %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = weather_AMK
) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)5B - Accuracy Metrics We can use modeltime_accuracy() to collect common accuracy metrics. The default reports the following metrics using yardstick functions:
MAE - Mean absolute error, mae() MAPE - Mean absolute percentage error, mape() MASE - Mean absolute scaled error, mase() SMAPE - Symmetric mean absolute percentage error, smape() RMSE - Root mean squared error, rmse() RSQ - R-squared, rsq()
calibration_tbl %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(
# .interactive = interactive
)4.2.6 Step 6 - Refit to Full Dataset & Forecast Forward
refit_tbl <- calibration_tbl %>%
modeltime_refit(data = weather_AMK)
refit_tbl %>%
modeltime_forecast(h = "10 days", actual_data = weather_AMK) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)4.2.7 Repeat step 1 to 6 for 12 months
4.2.7.1 Step 1 - Split data into training and test sets
weather_AMK_2023 <- weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12")
weather_AMK_2023 %>%
plot_time_series(Date, `Mean Temperature (°C)`)weather %>%
filter(Station == "Ang Mo Kio") %>%
filter_by_time(Date, "2023-01", "2023-12") %>%
summarise_by_time(Date,
.by = "day", # day, month
`Mean Temperature (°C)` = round(mean(`Mean Temperature (°C)`),2)) %>%
plot_acf_diagnostics(Date, `Mean Temperature (°C)`,
.lags = "1 years") # period # Split Data 80/20
splits_2023 <- initial_time_split(weather_AMK_2023, prop = 0.8)splits_2023<Training/Testing/Total>
<292/73/365>
4.2.7.2 Step 2 - Create & Fit Multiple Models
Model 1: Auto ARIMA
model_fit_arima_no_boost_2023 <- arima_reg() %>%
set_engine(engine = "auto_arima") %>%
fit(`Mean Temperature (°C)` ~ Date, data = training(splits_2023))Model 2: Boosted Auto ARIMA
model_fit_arima_boosted_2023 <- arima_boost(
min_n = 2,
learn_rate = 0.015
) %>%
set_engine(engine = "auto_arima_xgboost") %>%
fit(`Mean Temperature (°C)` ~ Date +
as.numeric(Date) + factor(month(Date, label = TRUE), ordered = F),
data = training(splits_2023))Model 3: Exponential Smoothing
model_fit_ets_2023 <- exp_smoothing() %>%
set_engine(engine = "ets") %>%
fit(`Mean Temperature (°C)` ~ Date , data = training(splits_2023))Model 4: Linear Regression
model_fit_lm_2023 <- linear_reg() %>%
set_engine("lm") %>%
fit(`Mean Temperature (°C)` ~ as.numeric(Date)
+ factor(month(Date, label = TRUE), ordered = FALSE),
data = training(splits_2023))4.2.7.3 Step 3 - Add fitted models to a Model Table
models_tbl_2023 <- modeltime_table(
model_fit_arima_no_boost_2023,
model_fit_arima_boosted_2023,
model_fit_ets_2023,
model_fit_lm_2023
)
models_tbl_2023# Modeltime Table
# A tibble: 4 × 3
.model_id .model .model_desc
<int> <list> <chr>
1 1 <fit[+]> ARIMA(1,1,2)(1,0,0)[7]
2 2 <fit[+]> ARIMA(1,1,2) W/ XGBOOST ERRORS
3 3 <fit[+]> ETS(A,N,N)
4 4 <fit[+]> LM
4.2.7.4 Step 4 - Calibrate the model to a testing set
calibration_tbl_2023 <- models_tbl %>%
modeltime_calibrate(new_data = testing(splits_2023))
calibration_tbl_2023# Modeltime Table
# A tibble: 4 × 5
.model_id .model .model_desc .type .calibration_data
<int> <list> <chr> <chr> <list>
1 1 <fit[+]> ARIMA(2,1,2) Test <tibble [73 × 4]>
2 2 <fit[+]> ARIMA(1,1,2) W/ XGBOOST ERRORS Test <tibble [73 × 4]>
3 3 <fit[+]> ETS(A,N,N) Test <tibble [73 × 4]>
4 4 <fit[+]> LM Test <tibble [73 × 4]>
4.2.7.5 Step 5 - Testing Set Forecast & Accuracy Evaluation
There are 2 critical parts to an evaluation:
- Visualizing the Forecast vs Test Data Set
- Evaluating the Test (Out of Sample) Accuracy
5A - Visualizing the Forecast Test Visualizing the Test Error is easy to do using the interactive plotly visualization (just toggle the visibility of the models using the Legend).
calibration_tbl_2023 %>%
modeltime_forecast(
new_data = testing(splits_2023),
actual_data = weather_AMK_2023
) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)5B - Accuracy Metrics We can use modeltime_accuracy() to collect common accuracy metrics. The default reports the following metrics using yardstick functions:
MAE - Mean absolute error, mae() MAPE - Mean absolute percentage error, mape() MASE - Mean absolute scaled error, mase() SMAPE - Symmetric mean absolute percentage error, smape() RMSE - Root mean squared error, rmse() RSQ - R-squared, rsq()
calibration_tbl_2023 %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(
# .interactive = interactive
)4.2.7.6 Step 6 - Refit to Full Dataset & Forecast Forward
refit_tbl_2023 <- calibration_tbl_2023 %>%
modeltime_refit(data = weather_AMK)
refit_tbl_2023 %>%
modeltime_forecast(h = "10 days", actual_data = weather_AMK_2023) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)4.2.8 Comparing ARIMA
This section compares the result of auto arima and arima for the period of 1 January 2021 to 31 December 2023.
# Auto arima
model_fit_autoarima <- arima_reg() %>%
set_engine(engine = "auto_arima") %>%
fit(`Mean Temperature (°C)` ~ Date, data = training(splits))
model_fit_autoarimaparsnip model object
Series: outcome
ARIMA(2,1,2)
Coefficients:
ar1 ar2 ma1 ma2
0.7994 -0.0464 -1.3154 0.3494
s.e. 0.2239 0.1193 0.2201 0.2042
sigma^2 = 0.8802: log likelihood = -1184.2
AIC=2378.4 AICc=2378.47 BIC=2402.27
# Manual arima
model_fit_manualarima <- arima_reg(
seasonal_period = 12,
non_seasonal_ar = 3,
non_seasonal_differences = 1,
non_seasonal_ma = 3,
seasonal_ar = 1,
seasonal_differences = 0,
seasonal_ma = 1
) %>%
set_engine("arima") %>%
fit(`Mean Temperature (°C)` ~ Date, data = training(splits))
model_fit_manualarimaparsnip model object
Series: outcome
ARIMA(3,1,3)(1,0,1)[12]
Coefficients:
ar1 ar2 ar3 ma1 ma2 ma3 sar1 sma1
-0.1952 0.6429 0.0180 -0.3194 -0.8579 0.2483 0.4129 -0.3888
s.e. 1.3228 0.2301 0.3318 1.3194 0.7054 0.5527 4.6926 4.6940
sigma^2 = 0.8824: log likelihood = -1183.32
AIC=2384.65 AICc=2384.85 BIC=2427.61
models_tbl_1 <- modeltime_table(
model_fit_autoarima,
model_fit_manualarima
)
models_tbl_1# Modeltime Table
# A tibble: 2 × 3
.model_id .model .model_desc
<int> <list> <chr>
1 1 <fit[+]> ARIMA(2,1,2)
2 2 <fit[+]> ARIMA(3,1,3)(1,0,1)[12]
calibration_tbl_1 <- models_tbl_1 %>%
modeltime_calibrate(new_data = testing(splits))
calibration_tbl_1# Modeltime Table
# A tibble: 2 × 5
.model_id .model .model_desc .type .calibration_data
<int> <list> <chr> <chr> <list>
1 1 <fit[+]> ARIMA(2,1,2) Test <tibble [219 × 4]>
2 2 <fit[+]> ARIMA(3,1,3)(1,0,1)[12] Test <tibble [219 × 4]>
calibration_tbl_1 %>%
modeltime_forecast(
new_data = testing(splits),
actual_data = weather_AMK
) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)calibration_tbl_1 %>%
modeltime_accuracy() %>%
table_modeltime_accuracy(
# .interactive = interactive
)calibration_tbl_1 %>%
modeltime_residuals() %>%
plot_modeltime_residuals(.interactive = TRUE)refit_tbl_1 <- calibration_tbl_1 %>%
modeltime_refit(data = weather_AMK)
refit_tbl_1 %>%
modeltime_forecast(h = "10 days", actual_data = weather_AMK) %>%
plot_modeltime_forecast(
.legend_max_width = 25, # For mobile screens
# .interactive = interactive,
.plotly_slider = TRUE
)